import os
import gzip
from collections import defaultdict
import pybedtools
from numpy import *


assembly = "hg38"

# Collect the CAGE library names: every file in the mapping directory is
# expected to be "<library>.bam"; the library name is the file name without
# that extension.
directory = "/osc-fs_home/mdehoon/Data/CASPARs/CAGE/Mapping/"
libraries = []
for filename in os.listdir(directory):
    # splitext only splits off the last extension, so library names that
    # themselves contain dots are handled.
    library, extension = os.path.splitext(filename)
    if extension != ".bam":
        # Explicit exception instead of assert, so the check is not
        # stripped when Python runs with -O.
        raise ValueError(
            "unexpected file %r in %s: expected a .bam file"
            % (filename, directory)
        )
    libraries.append(library)

# Sorted order fixes the column order of the counts matrix.
libraries.sort()
m = len(libraries)


# Read the chromosome sizes of the assembly into a dictionary mapping
# chromosome name -> length. Each non-blank line holds two
# whitespace-separated fields: the chromosome name and its size.
directory = "/osc-fs_home/scratch/mdehoon/Data/Genomes"
filename = "%s.chrom.sizes" % assembly
path = os.path.join(directory, assembly, filename)
print("Reading", path)
sizes = {}
# The context manager closes the file even if a line fails to parse.
with open(path) as stream:
    for line in stream:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing empty line).
            continue
        chromosome, size = line.split()
        sizes[chromosome] = int(size)

# Load the promoters from the gzipped BED file and group them by
# chromosome in `loci`. Each promoter's score field is overwritten with its
# 0-based position in the file (as a string); that index is used further
# down as the promoter's row in the counts matrix. After the loop, `n` is
# the total number of promoters read.
directory = "/osc-fs_home/mdehoon/Data/Fantom6/FANTOMCAT"
filename = "F6_CAT.promoter.bed.gz"
path = os.path.join(directory, filename)
print("Reading", path)
stream = gzip.open(path, "rt")
promoters = pybedtools.BedTool(stream)
loci = defaultdict(list)
n = 0  # stays 0 if the file contains no promoters
for n, promoter in enumerate(promoters, start=1):
    # n counts promoters seen so far, so the 0-based index is n - 1.
    promoter.score = str(n - 1)
    loci[promoter.chrom].append(promoter)
stream.close()

def _add_promoter_counts(promoters, data, counts, column):
    """Add the per-promoter signal of one chromosome to the counts matrix.

    promoters: the promoters located on the chromosome; each promoter's
               score field holds its row index in `counts`.
    data:      dictionary mapping strand ('+' or '-') to an array with the
               count at each position of the chromosome.
    counts:    matrix with one row per promoter and one column per library;
               modified in place.
    column:    column index of the library being processed.
    """
    for promoter in promoters:
        row = int(promoter.score)
        signal = data[promoter.strand]
        # Accumulate rather than assign: if a chromosome shows up in more
        # than one run of lines in the input, the contributions of every
        # run are added up instead of the last run overwriting the others.
        counts[row, column] += signal[promoter.start:promoter.end].sum()


# counts[i, j] is the total count within promoter i in library j.
counts = zeros((n, m))

for j, library in enumerate(libraries):
    filename = "%s.ctss.bed" % library
    print("Reading", filename)
    current = None  # chromosome whose lines are currently being read
    data = None     # per-strand count arrays for the current chromosome
    with open(filename) as stream:
        for line in pybedtools.BedTool(stream):
            if line.chrom != current:
                if current is not None:
                    # Finished a chromosome: tally its promoters. Using
                    # .get avoids inserting empty entries into loci for
                    # chromosomes without promoters.
                    _add_promoter_counts(loci.get(current, ()), data,
                                         counts, j)
                current = line.chrom
                size = sizes[current]
                data = {'+': zeros(size), '-': zeros(size)}
            # The start coordinate is the position and the score field
            # the count observed at that position.
            data[line.strand][line.start] += float(line.score)
    if current is not None:
        # Tally the last chromosome in the file (skipped for empty files).
        _add_promoter_counts(loci.get(current, ()), data, counts, j)

# Write the counts as a tab-delimited table: a header line with the library
# names, followed by one line per promoter with its name and its count in
# each library (formatted as integers).
filename = "promoters.FANTOM_CAT.THP-1.counts.txt"
print("Writing", filename)
with open(filename, 'w') as output:
    output.write("\t".join(["promoter"] + libraries) + "\n")
    for chromosome in loci:
        for promoter in loci[chromosome]:
            # The score field holds the promoter's row index in counts (it
            # was stored there when the promoters were loaded). Looking the
            # row up directly keeps each name paired with its own counts
            # even if the promoters are not visited in file order.
            row = int(promoter.score)
            fields = [promoter.name]
            fields.extend("%d" % value for value in counts[row, :])
            output.write("\t".join(fields) + "\n")
